#!/usr/bin/python
import code,pickle,sys,os
import ocropy
from pylab import *
from optparse import OptionParser

# Command-line interface.  Positional arguments are the line image
# files to process; the options select output files and which
# characters get included.
parser = OptionParser("""
usage: %prog [options] .../.../010001.png ...

Extract character images from OCR output.  This assumes that for each
line.png, there is a line.cseg.gt.png and line.gt.txt file.
""")
parser.add_option("-o","--output",help="output file",default="clusters.db")
parser.add_option("-u","--unmerged",help="unmerged output file",default=None)
parser.add_option("-m","--missegmented",help="output missegmented characters",action="store_true")
parser.add_option("-r","--raw",help="output unlabeled characters",action="store_true")
# The main loop calls collect(MAXAGE) each time the running character
# count reaches a multiple of MAXAGE.
parser.add_option("-a","--maxage",help="every MAXAGE characters, remove clusters via collect(MAXAGE)",default=10000000,type="int")
(options,args) = parser.parse_args()

# The character count is taken modulo maxage, so zero would raise
# ZeroDivisionError on the first character; reject it up front.
if options.maxage<=0:
    parser.error("--maxage must be a positive integer")

if len(args)<1:
    parser.print_help()
    sys.exit(0)

ion()
show()

if os.path.exists(options.output):
    print options.output,"exists; please remove"
    sys.exit(1)

if os.path.exists(options.output+".temp"):
    os.unlink(options.output+".temp")

# Feed every character yielded by ocropy.cseg_chars into a BinnedNN
# instance and save the result to options.output.
#
# The print calls below use the single-argument print(...) form, which
# produces the same output under Python 2 as the print statement.
binned = ocropy.BinnedNN()
snapshot = options.output+".temp"   # periodic progress snapshot
total = 0                           # characters added so far
for raw,mask,cls in ocropy.cseg_chars(args):
    if cls is None:
        # no ground truth
        if not options.raw: continue
        cls = "_"
    elif cls=="" or (not isinstance(cls,str) and cls<=0):
        # missegmented
        # (the numeric test is skipped for strings instead of relying
        # on Python 2's ordering between str and int, under which
        # "x"<=0 is always False)
        if not options.missegmented: continue
        cls = "~"
    raw = ocropy.NI(raw)
    # skip images with either dimension above 255
    if raw.shape[0]>255 or raw.shape[1]>255: continue
    peak = float(amax(raw))
    # A maximum of zero would make the normalization below divide by
    # zero and add a non-finite image to the store; skip such images.
    if peak==0: continue
    # scale so that the largest value becomes 1.0
    raw = raw/peak
    binned.add(raw,cls)
    total+=1
    if total%100==0:
        print("%d chars" % total)
        print(binned.stats())
    if total%options.maxage==0:
        print("removed %s clusters" % (binned.collect(options.maxage),))
    if total%10000==0:
        # this is just for "looking in" on the progress
        print("saving after %d chars" % total)
        binned.save(snapshot)

if options.unmerged is not None:
    binned.save(options.unmerged)

# NOTE: the remerge step is disabled, so the "unmerged" file and the
# final output are saved from the same state.
# binned.remerge()

binned.save(options.output)

# The final result is on disk now; the progress snapshot is redundant.
if os.path.exists(snapshot):
    os.unlink(snapshot)
